#packages i will need for project
library(readr)
library(tidyverse)
library(dbplyr)
library(viridis)
library(usmap)
library(modelr)
library(plotly)
library(hrbrthemes)
library(ggridges)
library(randomForest)
library(vip)
#import dataset
dir01 <- "/Users/Kalide/Downloads/countries of the world.csv"
path01 <- file.path(dir01)
df01 <- read_csv(path01)
#rename variables
df01 <- df01 %>%
rename(Area_sq.mi=`Area (sq. mi.)`,
pop.density_sq.mi = `Pop. Density (per sq. mi.)`,
Coastline = `Coastline (coast/area ratio)`,
inf.mortality_per1000births = `Infant mortality (per 1000 births)`,
GDP_percapita = `GDP ($ per capita)`,
Literacy.pct = `Literacy (%)`,
Phones_per1000 = `Phones (per 1000)`,
Arable_pct = `Arable (%)`,
Crops_pct = `Crops (%)`,
Other_pct = `Other (%)`)
#Exploratory data analysis
tibble(df01)
## # A tibble: 227 × 20
## Country Region Population Area_sq.mi pop.density_sq.… Coastline
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan ASIA (EX.… 31056997 647500 48 0
## 2 Albania EASTERN E… 3581655 28748 125. 1.26
## 3 Algeria NORTHERN … 32930091 2381740 13.8 0.04
## 4 American Samoa OCEANIA 57794 199 290. 58.3
## 5 Andorra WESTERN E… 71201 468 152. 0
## 6 Angola SUB-SAHAR… 12127071 1246700 9.7 0.13
## 7 Anguilla LATIN AME… 13477 102 132. 59.8
## 8 Antigua & Barbuda LATIN AME… 69108 443 156 34.5
## 9 Argentina LATIN AME… 39921833 2766890 14.4 0.18
## 10 Armenia C.W. OF I… 2976372 29800 99.9 0
## # … with 217 more rows, and 14 more variables: Net migration <dbl>,
## # inf.mortality_per1000births <dbl>, GDP_percapita <dbl>, Literacy.pct <dbl>,
## # Phones_per1000 <dbl>, Arable_pct <dbl>, Crops_pct <dbl>, Other_pct <dbl>,
## # Climate <dbl>, Birthrate <dbl>, Deathrate <dbl>, Agriculture <dbl>,
## # Industry <dbl>, Service <dbl>
summary(df01)
## Country Region Population Area_sq.mi
## Length:227 Length:227 Min. :7.026e+03 Min. : 2
## Class :character Class :character 1st Qu.:4.376e+05 1st Qu.: 4648
## Mode :character Mode :character Median :4.787e+06 Median : 86600
## Mean :2.874e+07 Mean : 598227
## 3rd Qu.:1.750e+07 3rd Qu.: 441811
## Max. :1.314e+09 Max. :17075200
##
## pop.density_sq.mi Coastline Net migration
## Min. : 0.00 Min. : 0.00 Min. :-20.99000
## 1st Qu.: 29.15 1st Qu.: 0.10 1st Qu.: -0.92750
## Median : 78.80 Median : 0.73 Median : 0.00000
## Mean : 379.05 Mean : 21.17 Mean : 0.03812
## 3rd Qu.: 190.15 3rd Qu.: 10.35 3rd Qu.: 0.99750
## Max. :16271.50 Max. :870.66 Max. : 23.06000
## NA's :3
## inf.mortality_per1000births GDP_percapita Literacy.pct Phones_per1000
## Min. : 2.29 Min. : 500 Min. : 17.60 Min. : 0.2
## 1st Qu.: 8.15 1st Qu.: 1900 1st Qu.: 70.60 1st Qu.: 37.8
## Median : 21.00 Median : 5550 Median : 92.50 Median : 176.2
## Mean : 35.51 Mean : 9690 Mean : 82.84 Mean : 236.1
## 3rd Qu.: 55.70 3rd Qu.:15700 3rd Qu.: 98.00 3rd Qu.: 389.6
## Max. :191.19 Max. :55100 Max. :100.00 Max. :1035.6
## NA's :3 NA's :1 NA's :18 NA's :4
## Arable_pct Crops_pct Other_pct Climate
## Min. : 0.00 Min. : 0.000 Min. : 33.33 Min. :1.000
## 1st Qu.: 3.22 1st Qu.: 0.190 1st Qu.: 71.65 1st Qu.:2.000
## Median :10.42 Median : 1.030 Median : 85.70 Median :2.000
## Mean :13.80 Mean : 4.564 Mean : 81.64 Mean :2.139
## 3rd Qu.:20.00 3rd Qu.: 4.440 3rd Qu.: 95.44 3rd Qu.:3.000
## Max. :62.11 Max. :50.680 Max. :100.00 Max. :4.000
## NA's :2 NA's :2 NA's :2 NA's :22
## Birthrate Deathrate Agriculture Industry
## Min. : 7.29 Min. : 2.290 Min. :0.00000 Min. :0.0200
## 1st Qu.:12.67 1st Qu.: 5.910 1st Qu.:0.03775 1st Qu.:0.1930
## Median :18.79 Median : 7.840 Median :0.09900 Median :0.2720
## Mean :22.11 Mean : 9.241 Mean :0.15084 Mean :0.2827
## 3rd Qu.:29.82 3rd Qu.:10.605 3rd Qu.:0.22100 3rd Qu.:0.3410
## Max. :50.73 Max. :29.740 Max. :0.76900 Max. :0.9060
## NA's :3 NA's :4 NA's :15 NA's :16
## Service
## Min. :0.0620
## 1st Qu.:0.4293
## Median :0.5710
## Mean :0.5653
## 3rd Qu.:0.6785
## Max. :0.9540
## NA's :15
looks like there is a decent amount of climate, literacy, industry, service, and agriculture data missing.
#count number of NA's in file and visualize NAs
missing.values <- df01 %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
dplyr::summarise(num.missing = n()) %>%
filter(is.missing == T) %>%
select(-is.missing) %>%
arrange(desc(num.missing))
#plot the chart of NAs
missing.values %>%
ggplot() +
geom_bar(aes(x=reorder(key, +num.missing), y=num.missing, fill = key),
stat = 'identity') +
coord_flip() +
labs(x='variable',
y="number of missing values",
title='Visual of Missing Values') +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + theme_minimal()
omit na’s from df
df02 <- na.omit(df01)
#revised dataframe
df02
## # A tibble: 179 × 20
## Country Region Population Area_sq.mi pop.density_sq.… Coastline
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan ASIA (EX.… 31056997 647500 48 0
## 2 Albania EASTERN E… 3581655 28748 125. 1.26
## 3 Algeria NORTHERN … 32930091 2381740 13.8 0.04
## 4 Anguilla LATIN AME… 13477 102 132. 59.8
## 5 Antigua & Barbuda LATIN AME… 69108 443 156 34.5
## 6 Argentina LATIN AME… 39921833 2766890 14.4 0.18
## 7 Armenia C.W. OF I… 2976372 29800 99.9 0
## 8 Aruba LATIN AME… 71891 193 372. 35.5
## 9 Australia OCEANIA 20264082 7686850 2.6 0.34
## 10 Austria WESTERN E… 8192880 83870 97.7 0
## # … with 169 more rows, and 14 more variables: Net migration <dbl>,
## # inf.mortality_per1000births <dbl>, GDP_percapita <dbl>, Literacy.pct <dbl>,
## # Phones_per1000 <dbl>, Arable_pct <dbl>, Crops_pct <dbl>, Other_pct <dbl>,
## # Climate <dbl>, Birthrate <dbl>, Deathrate <dbl>, Agriculture <dbl>,
## # Industry <dbl>, Service <dbl>
Correlation of variables
The factors that correlate with GDP per capita the most are net migration, infant mortality, literacy (%), phones (per 1000), climate, Birthrate, agriculture, service
df02.cor <- subset(df02,
select = -c(Country, Region)) #remove non-numeric columns
ggcorrplot::ggcorrplot(cor(df02.cor), tl.cex = 12)
Creating correlation dataframe
corr <- cor(df02.cor$GDP_percapita,df02.cor[, unlist(lapply(df02.cor,is.numeric))])
corr <- as.tibble(t(corr))
correlation <- data.frame(colnames(df02.cor[, unlist(lapply(df02.cor,is.numeric))]), corr)
correlation <- correlation %>%
rename(Features = colnames.df02.cor...unlist.lapply.df02.cor..is.numeric...., Correlation = V1) %>%
mutate(abs.correlation = abs(Correlation)) %>%
arrange(desc(abs.correlation))
correlation
## Features Correlation abs.correlation
## 1 GDP_percapita 1.00000000 1.00000000
## 2 Phones_per1000 0.88352011 0.88352011
## 3 Birthrate -0.65879545 0.65879545
## 4 inf.mortality_per1000births -0.63908984 0.63908984
## 5 Agriculture -0.61691880 0.61691880
## 6 Service 0.53655075 0.53655075
## 7 Literacy.pct 0.52288013 0.52288013
## 8 Net migration 0.37879042 0.37879042
## 9 Climate 0.36056651 0.36056651
## 10 Deathrate -0.24756242 0.24756242
## 11 Crops_pct -0.20784374 0.20784374
## 12 pop.density_sq.mi 0.19012217 0.19012217
## 13 Area_sq.mi 0.06835595 0.06835595
## 14 Other_pct 0.06644526 0.06644526
## 15 Arable_pct 0.04646475 0.04646475
## 16 Coastline 0.03581518 0.03581518
## 17 Population -0.03361824 0.03361824
## 18 Industry 0.03285465 0.03285465
log phones? – will be logged log arable –weak correlation. not necessary. will be dropped log agriculture – will be logged log deathrate – will be logged - not a strong predictor
#gdp looks right skewed, transform by using the logged values.
library(ggplot2)
df02 %>%
keep(is.numeric) %>% # Keep only numeric columns
gather() %>% # Convert to key-value pairs
ggplot(aes(value)) + # Plot the values
facet_wrap(~ key, scales = "free") + # In separate panels
geom_density() + # as density
theme_minimal()
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = `Net migration`)) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Net migration",
title = "Plot of gdp per capita and net migration")
#-----will be logged, relation will improve since inf. mortality is right skewed.
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = inf.mortality_per1000births)) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Infant mortality (per 1000 births)",
title = "Plot of gdp per capita and Infant mortality (per 1000 births)")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = Literacy.pct)) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Literacy (%)",
title = "Plot of gdp per capita and Literacy (%)")
#-----Will be logged to improve
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = Phones_per1000)) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Phones (per 1000)",
title = "Plot of gdp per capita and Phones (per 1000)")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = Climate, group = Climate)) +
geom_boxplot() +
labs(x = "log10 gdp per capita",
y = "Climate",
title = "Plot of gdp per capita and Climate")
#-----Will be logged to improve
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = Birthrate)) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Birthrate",
title = "Plot of gdp per capita and Birthrate")
#----- Will be logged to improve
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = Agriculture)) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Agriculture",
title = "Plot of gdp per capita and Agriculture")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = Service)) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Service",
title = "Plot of gdp per capita and Service")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = log10(inf.mortality_per1000births))) +
geom_point() +
geom_smooth() +
geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Infant mortality (per 1000 births)",
title = "Plot of gdp percapita and log10 infant mortality(per1000births)")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = log10(Phones_per1000))) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Phones (per 1000)",
title = "Plot of gdp per capita and log10 Phones (per 1000)")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = log10(Birthrate))) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Birthrate",
title = "Plot of gdp per capita and log10 Birthrate")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = log10(Agriculture))) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Agriculture",
title = "Plot of gdp per capita and log10 Agriculture")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = log10(Population))) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Population",
title = "Plot of gdp per capita and log10 Population")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = log10(Area_sq.mi))) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "Area_sq.mi",
title = "Plot of gdp per capita and log10 Area_sq.mi")
#-----
ggplot(data = df02.cor,
aes(x = log10(GDP_percapita),
y = log10(pop.density_sq.mi))) +
geom_point() +
geom_smooth() + geom_smooth(method = "lm", color = "red") +
labs(x = "log10 gdp per capita",
y = "pop.density_sq.mi",
title = "Plot of gdp per capita and log10 pop.density_sq.mi")
#add logged data columns into dataframe
df03 <- df02.cor
df03 <- df03 %>%
add_column(loggdp_percapita = log10(df03$GDP_percapita),
loginf.mortality_per1000births = log10(df03$inf.mortality_per1000births),
logBirthrate = log10(df03$Birthrate),
logAgriculture = log10(df03$Agriculture),
logArea_sq.mi = log10(df03$Area_sq.mi),
logpop.density_sq.mi = log10(df03$pop.density_sq.mi))
df03
## # A tibble: 179 × 24
## Population Area_sq.mi pop.density_sq.mi Coastline `Net migration`
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 31056997 647500 48 0 23.1
## 2 3581655 28748 125. 1.26 -4.93
## 3 32930091 2381740 13.8 0.04 -0.39
## 4 13477 102 132. 59.8 10.8
## 5 69108 443 156 34.5 -6.15
## 6 39921833 2766890 14.4 0.18 0.61
## 7 2976372 29800 99.9 0 -6.47
## 8 71891 193 372. 35.5 0
## 9 20264082 7686850 2.6 0.34 3.98
## 10 8192880 83870 97.7 0 2
## # … with 169 more rows, and 19 more variables:
## # inf.mortality_per1000births <dbl>, GDP_percapita <dbl>, Literacy.pct <dbl>,
## # Phones_per1000 <dbl>, Arable_pct <dbl>, Crops_pct <dbl>, Other_pct <dbl>,
## # Climate <dbl>, Birthrate <dbl>, Deathrate <dbl>, Agriculture <dbl>,
## # Industry <dbl>, Service <dbl>, loggdp_percapita <dbl>,
## # loginf.mortality_per1000births <dbl>, logBirthrate <dbl>,
## # logAgriculture <dbl>, logArea_sq.mi <dbl>, logpop.density_sq.mi <dbl>
re-run correlation: high correlation – logbirthrate, loginf.mortality_per1000births, agricultre,service
df03.cor <- df03 %>%
filter(logAgriculture != "-Inf")
df03.cor <- subset(df03.cor)
ggcorrplot::ggcorrplot(cor(df03.cor), tl.cex = 12)
visualization
df02 %>%
ggplot(aes(x= Region, y=Area_sq.mi, group = Region)) +
geom_violin() +
theme(axis.text.x = element_text(color = "black",
size = 7, angle=30,
vjust=.8, hjust=0.8))
df02 %>%
ggplot(aes(x=Region, y=Birthrate, group = Region, fill = Region)) +
geom_violin() +
theme(axis.text.x = element_text(color = "black",
size = 7, angle=30,
vjust=.8, hjust=0.8),
legend.position = "none")
df02 %>%
ggplot(aes(x=Region, y=GDP_percapita, group = Region, fill = Region)) +
geom_violin() +
theme(axis.text.x = element_text(color = "black",
size = 7, angle=30,
vjust=.8, hjust=0.8),
legend.position = "none")
df03.cor %>%
ggplot(aes(x=Climate, y=loginf.mortality_per1000births,
group = Climate, fill = Climate)) +
geom_violin() +
theme(axis.text.x = element_text(color = "black",
size = 7, angle=30,
vjust=.8, hjust=0.8),
legend.position = "none") +
scale_fill_viridis(discrete = FALSE, option = "H") + theme_minimal()
df03$Country <- df02$Country
df03$Region <- df02$Region
df03 %>%
ggplot(aes(x=loggdp_percapita, y=Region, fill = Region)) +
geom_density_ridges() + theme_ridges() + theme(legend.position = "none")
df03 %>%
ggplot(aes(x=loggdp_percapita, y=logAgriculture,
size = loggdp_percapita, color = Region)) +
geom_point() + theme_ipsum()
df03 %>%
ggplot(aes(x=loggdp_percapita, y=Service,
size = loggdp_percapita, color = Region)) +
geom_point() + theme_ipsum()
#pop
df03 %>%
ggplot(aes(x = Region, fill = Region)) +
geom_bar(aes(group = mean(Population))) + labs(x = "Regions") +
theme(legend.position = "none",
axis.text.x = element_text(color = "black",
size = 6, angle=30,
vjust=.8, hjust=0.8))
df03 %>%
ggplot() +
geom_bar(aes(reorder(x=Region, Region, function(x) - length(x)),
fill = Region)) +
labs(x = "Regions") + theme(legend.position = "none",
axis.text.x = element_text(color = "black",
size = 6, angle=30,
vjust=.8, hjust=0.8))
df03 %>%
ggplot() +
geom_point(aes(x=GDP_percapita, y= Literacy.pct,
color = as.factor(Region))) +
facet_wrap(Region~.) + theme_classic() + theme(legend.position = "none")
df03 %>%
ggplot() +
geom_point(aes(x=GDP_percapita, y= Industry,
color = as.factor(Region))) +
facet_wrap(Region~.) + theme_classic() + theme(legend.position = "none")
df03 %>%
ggplot() +
geom_point(aes(x=GDP_percapita, y= Service,
color = as.factor(Region))) +
facet_wrap(Region~.) + theme_classic() + theme(legend.position = "none")
df03 %>%
ggplot() +
geom_point(aes(x=GDP_percapita, y= Agriculture,
color = as.factor(Region))) +
facet_wrap(Region~.) + theme_classic() + theme(legend.position = "none")
so, the strongest correlator to log10gdp_percapita are loginf.mortality_per1000births, phones (per 1000), birth rate, agriculture, then Literacy.pct , service, Deathrate, Climate, logArea_sq.mi, Net migration,logpop.density_sq.mi
Create absolute value correlation dataframe
fcorr <- cor(df03.cor$loggdp_percapita,df03.cor[, unlist(lapply(df03.cor,is.numeric))])
fcorr <- as.tibble(t(fcorr))
finalcorrelation <- data.frame(colnames(df03.cor[, unlist(lapply(df03.cor,is.numeric))]), fcorr)
finalcorrelation <- finalcorrelation %>%
rename(Features = colnames.df03.cor...unlist.lapply.df03.cor..is.numeric...., Correlation = V1) %>%
mutate(abs.correlation = abs(Correlation)) %>%
arrange(desc(abs.correlation))
finalcorrelation
## Features Correlation abs.correlation
## 1 loggdp_percapita 1.00000000 1.00000000
## 2 GDP_percapita 0.88807428 0.88807428
## 3 loginf.mortality_per1000births -0.88379315 0.88379315
## 4 Phones_per1000 0.84648739 0.84648739
## 5 Birthrate -0.83259356 0.83259356
## 6 logBirthrate -0.83057113 0.83057113
## 7 inf.mortality_per1000births -0.82806953 0.82806953
## 8 logAgriculture -0.81456340 0.81456340
## 9 Agriculture -0.78374530 0.78374530
## 10 Literacy.pct 0.68479871 0.68479871
## 11 Service 0.59007464 0.59007464
## 12 Deathrate -0.39242424 0.39242424
## 13 Climate 0.34482417 0.34482417
## 14 logArea_sq.mi -0.24482617 0.24482617
## 15 Net migration 0.22612005 0.22612005
## 16 logpop.density_sq.mi 0.19993760 0.19993760
## 17 Crops_pct -0.15236322 0.15236322
## 18 pop.density_sq.mi 0.15184349 0.15184349
## 19 Industry 0.14573437 0.14573437
## 20 Arable_pct 0.06439032 0.06439032
## 21 Area_sq.mi 0.04974957 0.04974957
## 22 Coastline 0.04571155 0.04571155
## 23 Other_pct 0.02405177 0.02405177
## 24 Population -0.01243059 0.01243059
Build simple model
lm01 <- lm(loggdp_percapita ~ loginf.mortality_per1000births, data = df03.cor)
summary(lm01)
##
## Call:
## lm(formula = loggdp_percapita ~ loginf.mortality_per1000births,
## data = df03.cor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.80210 -0.13103 0.00578 0.11873 0.73945
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.05369 0.05729 88.21 <2e-16 ***
## loginf.mortality_per1000births -0.98475 0.03930 -25.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.24 on 176 degrees of freedom
## Multiple R-squared: 0.7811, Adjusted R-squared: 0.7798
## F-statistic: 628 on 1 and 176 DF, p-value: < 2.2e-16
#residuals look linear, no violation of model assumptions
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(sample=resid)) +
geom_qq() +
theme_minimal() +
labs(x = "Theoretical Quantiles",
y = "Sample Quantilies",
title = "QQ Plot: Standardized residuals" )
# residuals look normally distributed
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=resid)) +
geom_histogram(bins=10) +
labs(x="Residuals",
title = "Distribution of residuals") +
theme_minimal()
#residuals look random
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=loginf.mortality_per1000births)) +
geom_point(aes(y=resid)) +
labs(x="Residuals",
title = "Distribution of residuals vs. infant mortality per 1000 births") +
theme_minimal()
See if any other variable can be added to model
#rename net migration
df03.cor <- df03.cor %>%
rename(net_migration='Net migration')
#----- random
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=Birthrate)) +
geom_point(aes(y=resid)) +
labs(x="Birthrate",
y="Residuals",
title = "Residuals vs. Birthrate") +
theme_minimal()
#----- there is a strong clear negative relationship
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=Phones_per1000)) +
geom_point(aes(y=resid)) +
labs(x="Phones (per 1000 births)",
y="Residuals",
title = "Residuals vs. Phones (per 1000 births)") +
theme_minimal()
#----- there is some sort of negative relationship
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=logAgriculture)) +
geom_point(aes(y=resid)) +
labs(x="Agriculture",
y="Residuals",
title = "Residuals vs. Agriculture") +
theme_minimal()
#----- looks like there is some sort of pattern but its weak. most of the residuals are concentrated in one area
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=Literacy.pct)) +
geom_point(aes(y=resid)) +
labs(x="Literacy (%)",
y="Residuals",
title = "Residuals vs. Literacy (%)") +
theme_minimal()
#------ doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=Service)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. Service") +
theme_minimal()
#------ doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=Climate)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. Service") +
theme_minimal()
#------ doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=logArea_sq.mi)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. log10 Area_sq.mi") +
theme_minimal()
#------ doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=logpop.density_sq.mi)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. log10 population density sq.mi") +
theme_minimal()
#------doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm01, "resid") %>%
ggplot(aes(x=net_migration)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. Net Migration") +
theme_minimal()
2nd simple model
lm02 <- lm(loggdp_percapita ~ loginf.mortality_per1000births + logAgriculture, data = df03.cor)
summary(lm02)
##
## Call:
## lm(formula = loggdp_percapita ~ loginf.mortality_per1000births +
## logAgriculture, data = df03.cor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6968 -0.1128 0.0228 0.1319 0.5280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.30313 0.10685 40.271 < 2e-16 ***
## loginf.mortality_per1000births -0.69169 0.05014 -13.796 < 2e-16 ***
## logAgriculture -0.32584 0.04116 -7.917 2.68e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2065 on 175 degrees of freedom
## Multiple R-squared: 0.8388, Adjusted R-squared: 0.837
## F-statistic: 455.4 on 2 and 175 DF, p-value: < 2.2e-16
the remaining variables all look random with the exception of ogibes
#----- doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm02, "resid") %>%
ggplot(aes(x=Birthrate)) +
geom_point(aes(y=resid)) +
labs(x="Birthrate",
y="Residuals",
title = "Residuals vs. Birthrate") +
theme_minimal()
#----- looks negatively correlated
df03.cor %>%
add_residuals(lm02, "resid") %>%
ggplot(aes(x=Phones_per1000)) +
geom_point(aes(y=resid)) +
labs(x="Phones per 1000",
y="Residuals",
title = "Residuals vs. Phones per 1000") +
theme_minimal()
#----- doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm02, "resid") %>%
ggplot(aes(x=Literacy.pct)) +
geom_point(aes(y=resid)) +
labs(x="Literacy (%)",
y="Residuals",
title = "Residuals vs. Literacy (%)") +
theme_minimal()
#------ doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm02, "resid") %>%
ggplot(aes(x=Service)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. Service") +
theme_minimal()
#------doesn't look like it adds much to model
df03.cor %>%
add_residuals(lm02, "resid") %>%
ggplot(aes(x=Climate)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. Service") +
theme_minimal()
#------
df03.cor %>%
add_residuals(lm02, "resid") %>%
ggplot(aes(x=logArea_sq.mi)) +
geom_boxplot(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. log10 Area_sq.mi") +
theme_minimal()
#------
df03.cor %>%
add_residuals(lm02, "resid") %>%
ggplot(aes(x=logpop.density_sq.mi)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. log10 population density sq.mi") +
theme_minimal()
#------
df03.cor %>%
add_residuals(lm02, "resid") %>%
ggplot(aes(x=net_migration)) +
geom_point(aes(y=resid)) +
labs(x="Service",
y="Residuals",
title = "Residuals vs. Net Migration") +
theme_minimal()
final simple model
lm03 <- lm(loggdp_percapita ~ Phones_per1000 + loginf.mortality_per1000births + logAgriculture, data = df03.cor)
summary(lm03)
##
## Call:
## lm(formula = loggdp_percapita ~ Phones_per1000 + loginf.mortality_per1000births +
## logAgriculture, data = df03.cor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48494 -0.10865 0.00893 0.13101 0.49842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9518192 0.1221741 32.346 < 2e-16 ***
## Phones_per1000 0.0006478 0.0001290 5.021 1.27e-06 ***
## loginf.mortality_per1000births -0.4936898 0.0613474 -8.047 1.27e-13 ***
## logAgriculture -0.2721211 0.0400316 -6.798 1.63e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1936 on 174 degrees of freedom
## Multiple R-squared: 0.8592, Adjusted R-squared: 0.8568
## F-statistic: 354 on 3 and 174 DF, p-value: < 2.2e-16
#residuals look linear, no violation of model assumptions
df03.cor %>%
add_residuals(lm03, "resid") %>%
ggplot(aes(sample=resid)) +
geom_qq() +
theme_minimal() +
labs(x = "Theoretical Quantiles",
y = "Sample Quantilies",
title = "QQ Plot: Standardized residuals" )
# residuals look normally distributed
df03.cor %>%
add_residuals(lm03, "resid") %>%
ggplot(aes(x=resid)) +
geom_histogram(bins=30) +
labs(x="Residuals",
title = "Distribution of residuals") +
theme_minimal()
kitchen sink
lm03.01 <- lm(loggdp_percapita ~. , data = df03.cor)
step(lm03.01)
## Start: AIC=-705.03
## loggdp_percapita ~ Population + Area_sq.mi + pop.density_sq.mi +
## Coastline + net_migration + inf.mortality_per1000births +
## GDP_percapita + Literacy.pct + Phones_per1000 + Arable_pct +
## Crops_pct + Other_pct + Climate + Birthrate + Deathrate +
## Agriculture + Industry + Service + loginf.mortality_per1000births +
## logBirthrate + logAgriculture + logArea_sq.mi + logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - logAgriculture 1 0.00004 2.5890 -707.03
## - Population 1 0.00005 2.5890 -707.03
## - Coastline 1 0.00013 2.5891 -707.02
## - Area_sq.mi 1 0.00104 2.5900 -706.96
## - Literacy.pct 1 0.00511 2.5941 -706.68
## - loginf.mortality_per1000births 1 0.00526 2.5943 -706.67
## - Phones_per1000 1 0.00675 2.5957 -706.57
## - logArea_sq.mi 1 0.00754 2.5965 -706.51
## - Industry 1 0.01271 2.6017 -706.16
## - net_migration 1 0.01392 2.6029 -706.08
## - Service 1 0.01564 2.6046 -705.96
## - pop.density_sq.mi 1 0.01774 2.6067 -705.82
## - logpop.density_sq.mi 1 0.02086 2.6098 -705.60
## - Agriculture 1 0.02418 2.6132 -705.38
## - Climate 1 0.02805 2.6170 -705.11
## <none> 2.5890 -705.03
## - Arable_pct 1 0.09539 2.6844 -700.59
## - Other_pct 1 0.09542 2.6844 -700.59
## - Crops_pct 1 0.09555 2.6845 -700.58
## - logBirthrate 1 0.11648 2.7055 -699.20
## - Deathrate 1 0.13329 2.7223 -698.10
## - inf.mortality_per1000births 1 0.22227 2.8113 -692.37
## - Birthrate 1 0.31080 2.8998 -686.85
## - GDP_percapita 1 1.36502 3.9540 -631.66
##
## Step: AIC=-707.03
## loggdp_percapita ~ Population + Area_sq.mi + pop.density_sq.mi +
## Coastline + net_migration + inf.mortality_per1000births +
## GDP_percapita + Literacy.pct + Phones_per1000 + Arable_pct +
## Crops_pct + Other_pct + Climate + Birthrate + Deathrate +
## Agriculture + Industry + Service + loginf.mortality_per1000births +
## logBirthrate + logArea_sq.mi + logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - Population 1 0.00006 2.5891 -709.02
## - Coastline 1 0.00012 2.5892 -709.02
## - Area_sq.mi 1 0.00102 2.5901 -708.96
## - Literacy.pct 1 0.00510 2.5941 -708.68
## - loginf.mortality_per1000births 1 0.00535 2.5944 -708.66
## - Phones_per1000 1 0.00671 2.5957 -708.57
## - logArea_sq.mi 1 0.00783 2.5969 -708.49
## - Industry 1 0.01268 2.6017 -708.16
## - net_migration 1 0.01403 2.6031 -708.07
## - Service 1 0.01563 2.6047 -707.96
## - pop.density_sq.mi 1 0.01997 2.6090 -707.66
## - logpop.density_sq.mi 1 0.02160 2.6106 -707.55
## - Agriculture 1 0.02437 2.6134 -707.36
## <none> 2.5890 -707.03
## - Climate 1 0.03067 2.6197 -706.93
## - Arable_pct 1 0.09572 2.6848 -702.57
## - Other_pct 1 0.09575 2.6848 -702.56
## - Crops_pct 1 0.09588 2.6849 -702.56
## - logBirthrate 1 0.12050 2.7095 -700.93
## - Deathrate 1 0.13568 2.7247 -699.94
## - inf.mortality_per1000births 1 0.22457 2.8136 -694.22
## - Birthrate 1 0.31915 2.9082 -688.34
## - GDP_percapita 1 1.67353 4.2626 -620.28
##
## Step: AIC=-709.02
## loggdp_percapita ~ Area_sq.mi + pop.density_sq.mi + Coastline +
## net_migration + inf.mortality_per1000births + GDP_percapita +
## Literacy.pct + Phones_per1000 + Arable_pct + Crops_pct +
## Other_pct + Climate + Birthrate + Deathrate + Agriculture +
## Industry + Service + loginf.mortality_per1000births + logBirthrate +
## logArea_sq.mi + logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - Coastline 1 0.00012 2.5892 -711.02
## - Area_sq.mi 1 0.00189 2.5910 -710.90
## - Literacy.pct 1 0.00505 2.5941 -710.68
## - loginf.mortality_per1000births 1 0.00537 2.5945 -710.66
## - Phones_per1000 1 0.00673 2.5958 -710.56
## - logArea_sq.mi 1 0.00804 2.5971 -710.47
## - Industry 1 0.01263 2.6017 -710.16
## - net_migration 1 0.01398 2.6031 -710.07
## - Service 1 0.01557 2.6047 -709.96
## - pop.density_sq.mi 1 0.02018 2.6093 -709.64
## - logpop.density_sq.mi 1 0.02382 2.6129 -709.40
## - Agriculture 1 0.02431 2.6134 -709.36
## <none> 2.5891 -709.02
## - Climate 1 0.03078 2.6199 -708.92
## - Arable_pct 1 0.09596 2.6851 -704.55
## - Other_pct 1 0.09600 2.6851 -704.54
## - Crops_pct 1 0.09613 2.6852 -704.54
## - logBirthrate 1 0.12069 2.7098 -702.92
## - Deathrate 1 0.13576 2.7248 -701.93
## - inf.mortality_per1000births 1 0.22539 2.8145 -696.17
## - Birthrate 1 0.32211 2.9112 -690.15
## - GDP_percapita 1 1.68471 4.2738 -621.81
##
## Step: AIC=-711.02
## loggdp_percapita ~ Area_sq.mi + pop.density_sq.mi + net_migration +
## inf.mortality_per1000births + GDP_percapita + Literacy.pct +
## Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Climate +
## Birthrate + Deathrate + Agriculture + Industry + Service +
## loginf.mortality_per1000births + logBirthrate + logArea_sq.mi +
## logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - Area_sq.mi 1 0.00183 2.5910 -712.89
## - Literacy.pct 1 0.00504 2.5942 -712.67
## - loginf.mortality_per1000births 1 0.00558 2.5948 -712.63
## - Phones_per1000 1 0.00670 2.5959 -712.56
## - logArea_sq.mi 1 0.00860 2.5978 -712.43
## - Industry 1 0.01261 2.6018 -712.15
## - net_migration 1 0.01401 2.6032 -712.06
## - Service 1 0.01553 2.6047 -711.95
## - pop.density_sq.mi 1 0.02052 2.6097 -711.61
## - logpop.density_sq.mi 1 0.02385 2.6131 -711.38
## - Agriculture 1 0.02427 2.6135 -711.36
## <none> 2.5892 -711.02
## - Climate 1 0.03094 2.6202 -710.90
## - Arable_pct 1 0.09600 2.6852 -706.54
## - Other_pct 1 0.09603 2.6852 -706.53
## - Crops_pct 1 0.09616 2.6854 -706.53
## - logBirthrate 1 0.12057 2.7098 -704.92
## - Deathrate 1 0.13565 2.7249 -703.93
## - inf.mortality_per1000births 1 0.22534 2.8146 -698.16
## - Birthrate 1 0.32203 2.9112 -692.15
## - GDP_percapita 1 1.71889 4.3081 -622.39
##
## Step: AIC=-712.89
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births +
## GDP_percapita + Literacy.pct + Phones_per1000 + Arable_pct +
## Crops_pct + Other_pct + Climate + Birthrate + Deathrate +
## Agriculture + Industry + Service + loginf.mortality_per1000births +
## logBirthrate + logArea_sq.mi + logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - loginf.mortality_per1000births 1 0.00460 2.5956 -714.57
## - Literacy.pct 1 0.00486 2.5959 -714.56
## - Phones_per1000 1 0.00508 2.5961 -714.54
## - Industry 1 0.01248 2.6035 -714.04
## - net_migration 1 0.01374 2.6048 -713.95
## - Service 1 0.01539 2.6064 -713.84
## - pop.density_sq.mi 1 0.01904 2.6101 -713.59
## - logArea_sq.mi 1 0.02074 2.6118 -713.47
## - Agriculture 1 0.02410 2.6151 -713.24
## - logpop.density_sq.mi 1 0.02428 2.6153 -713.23
## <none> 2.5910 -712.89
## - Climate 1 0.02967 2.6207 -712.86
## - Arable_pct 1 0.09741 2.6884 -708.32
## - Other_pct 1 0.09745 2.6885 -708.32
## - Crops_pct 1 0.09758 2.6886 -708.31
## - logBirthrate 1 0.11955 2.7106 -706.86
## - Deathrate 1 0.13668 2.7277 -705.74
## - inf.mortality_per1000births 1 0.22351 2.8146 -700.16
## - Birthrate 1 0.32037 2.9114 -694.14
## - GDP_percapita 1 1.72813 4.3192 -623.93
##
## Step: AIC=-714.57
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births +
## GDP_percapita + Literacy.pct + Phones_per1000 + Arable_pct +
## Crops_pct + Other_pct + Climate + Birthrate + Deathrate +
## Agriculture + Industry + Service + logBirthrate + logArea_sq.mi +
## logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - Literacy.pct 1 0.00464 2.6003 -716.26
## - Phones_per1000 1 0.00492 2.6006 -716.24
## - net_migration 1 0.01244 2.6081 -715.72
## - Industry 1 0.01273 2.6084 -715.70
## - Service 1 0.01582 2.6115 -715.49
## - pop.density_sq.mi 1 0.01921 2.6149 -715.26
## - logArea_sq.mi 1 0.02170 2.6173 -715.09
## - Agriculture 1 0.02461 2.6203 -714.89
## - logpop.density_sq.mi 1 0.02584 2.6215 -714.81
## <none> 2.5956 -714.57
## - Climate 1 0.03395 2.6296 -714.26
## - Arable_pct 1 0.09849 2.6941 -709.95
## - Other_pct 1 0.09854 2.6942 -709.94
## - Crops_pct 1 0.09868 2.6943 -709.93
## - Deathrate 1 0.13581 2.7315 -707.50
## - logBirthrate 1 0.17332 2.7690 -705.07
## - inf.mortality_per1000births 1 0.34118 2.9368 -694.59
## - Birthrate 1 0.41478 3.0104 -690.19
## - GDP_percapita 1 2.01834 4.6140 -614.18
##
## Step: AIC=-716.26
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births +
## GDP_percapita + Phones_per1000 + Arable_pct + Crops_pct +
## Other_pct + Climate + Birthrate + Deathrate + Agriculture +
## Industry + Service + logBirthrate + logArea_sq.mi + logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - Phones_per1000 1 0.00455 2.6048 -717.95
## - Industry 1 0.01141 2.6117 -717.48
## - net_migration 1 0.01279 2.6131 -717.38
## - Service 1 0.01434 2.6146 -717.28
## - pop.density_sq.mi 1 0.01868 2.6190 -716.98
## - logArea_sq.mi 1 0.02127 2.6215 -716.81
## - Agriculture 1 0.02272 2.6230 -716.71
## - logpop.density_sq.mi 1 0.02392 2.6242 -716.63
## <none> 2.6003 -716.26
## - Climate 1 0.03740 2.6377 -715.72
## - Arable_pct 1 0.09752 2.6978 -711.70
## - Other_pct 1 0.09757 2.6978 -711.70
## - Crops_pct 1 0.09772 2.6980 -711.69
## - Deathrate 1 0.13284 2.7331 -709.39
## - logBirthrate 1 0.16989 2.7702 -706.99
## - inf.mortality_per1000births 1 0.33662 2.9369 -696.59
## - Birthrate 1 0.41679 3.0171 -691.79
## - GDP_percapita 1 2.02332 4.6236 -615.81
##
## Step: AIC=-717.95
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births +
## GDP_percapita + Arable_pct + Crops_pct + Other_pct + Climate +
## Birthrate + Deathrate + Agriculture + Industry + Service +
## logBirthrate + logArea_sq.mi + logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - Industry 1 0.0104 2.6153 -719.23
## - net_migration 1 0.0112 2.6160 -719.18
## - Service 1 0.0130 2.6178 -719.06
## - pop.density_sq.mi 1 0.0156 2.6204 -718.88
## - Agriculture 1 0.0213 2.6261 -718.50
## - logpop.density_sq.mi 1 0.0246 2.6295 -718.27
## <none> 2.6048 -717.95
## - logArea_sq.mi 1 0.0302 2.6350 -717.89
## - Climate 1 0.0375 2.6423 -717.40
## - Arable_pct 1 0.0937 2.6985 -713.66
## - Other_pct 1 0.0937 2.6986 -713.65
## - Crops_pct 1 0.0939 2.6987 -713.64
## - Deathrate 1 0.1290 2.7338 -711.35
## - logBirthrate 1 0.1656 2.7705 -708.97
## - inf.mortality_per1000births 1 0.3342 2.9390 -698.46
## - Birthrate 1 0.4122 3.0171 -693.79
## - GDP_percapita 1 4.2706 6.8754 -547.18
##
## Step: AIC=-719.23
## loggdp_percapita ~ pop.density_sq.mi + net_migration + inf.mortality_per1000births +
## GDP_percapita + Arable_pct + Crops_pct + Other_pct + Climate +
## Birthrate + Deathrate + Agriculture + Service + logBirthrate +
## logArea_sq.mi + logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - net_migration 1 0.0119 2.6272 -720.43
## - pop.density_sq.mi 1 0.0164 2.6317 -720.12
## - logpop.density_sq.mi 1 0.0281 2.6434 -719.33
## <none> 2.6153 -719.23
## - logArea_sq.mi 1 0.0338 2.6490 -718.95
## - Climate 1 0.0362 2.6515 -718.78
## - Service 1 0.0389 2.6542 -718.60
## - Arable_pct 1 0.0911 2.7064 -715.14
## - Other_pct 1 0.0912 2.7064 -715.13
## - Crops_pct 1 0.0913 2.7066 -715.12
## - Deathrate 1 0.1274 2.7427 -712.76
## - logBirthrate 1 0.1690 2.7843 -710.09
## - Agriculture 1 0.2994 2.9147 -701.94
## - inf.mortality_per1000births 1 0.3487 2.9640 -698.96
## - Birthrate 1 0.4112 3.0265 -695.24
## - GDP_percapita 1 4.2604 6.8756 -549.18
##
## Step: AIC=-720.43
## loggdp_percapita ~ pop.density_sq.mi + inf.mortality_per1000births +
## GDP_percapita + Arable_pct + Crops_pct + Other_pct + Climate +
## Birthrate + Deathrate + Agriculture + Service + logBirthrate +
## logArea_sq.mi + logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - pop.density_sq.mi 1 0.0169 2.6440 -721.29
## - logpop.density_sq.mi 1 0.0279 2.6551 -720.55
## <none> 2.6272 -720.43
## - logArea_sq.mi 1 0.0350 2.6622 -720.07
## - Service 1 0.0389 2.6660 -719.81
## - Climate 1 0.0438 2.6709 -719.49
## - Arable_pct 1 0.0876 2.7147 -716.59
## - Other_pct 1 0.0876 2.7148 -716.59
## - Crops_pct 1 0.0878 2.7150 -716.57
## - Deathrate 1 0.1213 2.7484 -714.39
## - logBirthrate 1 0.1636 2.7907 -711.67
## - Agriculture 1 0.2973 2.9245 -703.34
## - inf.mortality_per1000births 1 0.3400 2.9671 -700.77
## - Birthrate 1 0.4011 3.0283 -697.13
## - GDP_percapita 1 5.2077 7.8349 -527.93
##
## Step: AIC=-721.29
## loggdp_percapita ~ inf.mortality_per1000births + GDP_percapita +
## Arable_pct + Crops_pct + Other_pct + Climate + Birthrate +
## Deathrate + Agriculture + Service + logBirthrate + logArea_sq.mi +
## logpop.density_sq.mi
##
## Df Sum of Sq RSS AIC
## - logpop.density_sq.mi 1 0.0156 2.6596 -722.24
## <none> 2.6440 -721.29
## - Service 1 0.0327 2.6767 -721.10
## - logArea_sq.mi 1 0.0355 2.6795 -720.91
## - Climate 1 0.0509 2.6949 -719.89
## - Arable_pct 1 0.0884 2.7325 -717.43
## - Other_pct 1 0.0885 2.7325 -717.43
## - Crops_pct 1 0.0887 2.7327 -717.42
## - Deathrate 1 0.1197 2.7637 -715.41
## - logBirthrate 1 0.1469 2.7909 -713.66
## - Agriculture 1 0.2895 2.9335 -704.79
## - inf.mortality_per1000births 1 0.3396 2.9837 -701.78
## - Birthrate 1 0.3849 3.0289 -699.10
## - GDP_percapita 1 5.2272 7.8712 -529.11
##
## Step: AIC=-722.24
## loggdp_percapita ~ inf.mortality_per1000births + GDP_percapita +
## Arable_pct + Crops_pct + Other_pct + Climate + Birthrate +
## Deathrate + Agriculture + Service + logBirthrate + logArea_sq.mi
##
## Df Sum of Sq RSS AIC
## - logArea_sq.mi 1 0.0206 2.6803 -722.86
## <none> 2.6596 -722.24
## - Service 1 0.0318 2.6914 -722.12
## - Climate 1 0.0459 2.7055 -721.19
## - Other_pct 1 0.0927 2.7523 -718.14
## - Arable_pct 1 0.0927 2.7523 -718.14
## - Crops_pct 1 0.0929 2.7525 -718.13
## - Deathrate 1 0.1589 2.8186 -713.91
## - logBirthrate 1 0.1756 2.8352 -712.86
## - Agriculture 1 0.2795 2.9392 -706.45
## - inf.mortality_per1000births 1 0.3828 3.0425 -700.30
## - Birthrate 1 0.4191 3.0788 -698.19
## - GDP_percapita 1 5.2120 7.8716 -531.10
##
## Step: AIC=-722.86
## loggdp_percapita ~ inf.mortality_per1000births + GDP_percapita +
## Arable_pct + Crops_pct + Other_pct + Climate + Birthrate +
## Deathrate + Agriculture + Service + logBirthrate
##
## Df Sum of Sq RSS AIC
## - Service 1 0.0192 2.6995 -723.59
## <none> 2.6803 -722.86
## - Climate 1 0.0483 2.7286 -721.68
## - Other_pct 1 0.0929 2.7732 -718.79
## - Arable_pct 1 0.0930 2.7733 -718.79
## - Crops_pct 1 0.0931 2.7734 -718.79
## - Deathrate 1 0.1529 2.8332 -714.99
## - logBirthrate 1 0.1720 2.8523 -713.79
## - Agriculture 1 0.2813 2.9616 -707.10
## - inf.mortality_per1000births 1 0.3707 3.0510 -701.80
## - Birthrate 1 0.4201 3.1003 -698.95
## - GDP_percapita 1 5.2660 7.9463 -531.42
##
## Step: AIC=-723.59
## loggdp_percapita ~ inf.mortality_per1000births + GDP_percapita +
## Arable_pct + Crops_pct + Other_pct + Climate + Birthrate +
## Deathrate + Agriculture + logBirthrate
##
## Df Sum of Sq RSS AIC
## <none> 2.6995 -723.59
## - Climate 1 0.0488 2.7483 -722.40
## - Other_pct 1 0.0972 2.7967 -719.29
## - Arable_pct 1 0.0972 2.7968 -719.29
## - Crops_pct 1 0.0974 2.7969 -719.28
## - Deathrate 1 0.1567 2.8563 -715.54
## - logBirthrate 1 0.1809 2.8804 -714.04
## - Agriculture 1 0.2621 2.9616 -709.10
## - inf.mortality_per1000births 1 0.3547 3.0542 -703.62
## - Birthrate 1 0.4401 3.1396 -698.71
## - GDP_percapita 1 5.2866 7.9861 -532.53
##
## Call:
## lm(formula = loggdp_percapita ~ inf.mortality_per1000births +
## GDP_percapita + Arable_pct + Crops_pct + Other_pct + Climate +
## Birthrate + Deathrate + Agriculture + logBirthrate, data = df03.cor)
##
## Coefficients:
## (Intercept) inf.mortality_per1000births
## 2.716e+02 -3.870e-03
## GDP_percapita Arable_pct
## 2.966e-05 -2.686e+00
## Crops_pct Other_pct
## -2.687e+00 -2.685e+00
## Climate Birthrate
## -3.171e-02 -2.578e-02
## Deathrate Agriculture
## 9.141e-03 -4.289e-01
## logBirthrate
## 8.670e-01
kitchen sink model loggdp_percapita ~ net_migration + Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Birthrate + Deathrate + Service + loginf.mortality_per1000births + logBirthrate + logAgriculture + logpop.density_sq.mi, data = df03.cor
model fit..of kitchen sink model
lm04 <- lm(loggdp_percapita ~ net_migration +
Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Birthrate +
Deathrate + Service + loginf.mortality_per1000births +
logAgriculture + logpop.density_sq.mi, data = df03.cor)
summary(lm04)
##
## Call:
## lm(formula = loggdp_percapita ~ net_migration + Phones_per1000 +
## Arable_pct + Crops_pct + Other_pct + Birthrate + Deathrate +
## Service + loginf.mortality_per1000births + logAgriculture +
## logpop.density_sq.mi, data = df03.cor)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48015 -0.11478 0.01303 0.11016 0.45185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.012e+02 1.454e+02 2.758 0.006466 **
## net_migration 5.863e-03 3.201e-03 1.831 0.068824 .
## Phones_per1000 6.294e-04 1.220e-04 5.160 6.97e-07 ***
## Arable_pct -3.969e+00 1.455e+00 -2.729 0.007044 **
## Crops_pct -3.970e+00 1.454e+00 -2.730 0.007027 **
## Other_pct -3.970e+00 1.455e+00 -2.729 0.007032 **
## Birthrate -1.319e-02 2.312e-03 -5.702 5.30e-08 ***
## Deathrate -4.639e-04 3.093e-03 -0.150 0.880966
## Service -9.818e-02 1.063e-01 -0.923 0.357124
## loginf.mortality_per1000births -2.782e-01 7.191e-02 -3.869 0.000157 ***
## logAgriculture -2.671e-01 4.102e-02 -6.512 8.46e-10 ***
## logpop.density_sq.mi -9.388e-02 2.710e-02 -3.464 0.000678 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1688 on 166 degrees of freedom
## Multiple R-squared: 0.8979, Adjusted R-squared: 0.8911
## F-statistic: 132.7 on 11 and 166 DF, p-value: < 2.2e-16
extractAIC(lm04)
## [1] 12.0000 -621.7977
df03.cor %>%
add_residuals(lm04, "resid") %>%
ggplot(aes(sample=resid)) +
geom_qq() +
theme_minimal() +
labs(x = "Theoretical Quantiles",
y = "Sample Quantilies",
title = "QQ Plot: Standardized residuals")
df03.cor %>%
add_residuals(lm04, "resid") %>%
ggplot(aes(x=resid)) +
geom_histogram(bins=30) +
labs(x="Residuals",
title = "Distribution of residuals") +
theme_minimal()
5-k cross fold modeling
set.seed(2)
df04.cv <- crossv_kfold(df03.cor, k = 5)
df04.cv
## # A tibble: 5 × 3
## train test .id
## <named list> <named list> <chr>
## 1 <resample [142 x 24]> <resample [36 x 24]> 1
## 2 <resample [142 x 24]> <resample [36 x 24]> 2
## 3 <resample [142 x 24]> <resample [36 x 24]> 3
## 4 <resample [143 x 24]> <resample [35 x 24]> 4
## 5 <resample [143 x 24]> <resample [35 x 24]> 5
cv_qda1 <- df04.cv %>%
mutate(
fit = purrr::map(train,
~lm(loggdp_percapita ~ net_migration + loginf.mortality_per1000births + Phones_per1000 + Arable_pct +
Crops_pct + Other_pct + Birthrate +
Deathrate + Service +
loginf.mortality_per1000births + logAgriculture +
logpop.density_sq.mi,
data = .)),
rmse = purrr::map2_dbl(fit, test, ~rmse(.x,.y)))
cv_qda1
## # A tibble: 5 × 5
## train test .id fit rmse
## <named list> <named list> <chr> <named list> <dbl>
## 1 <resample [142 x 24]> <resample [36 x 24]> 1 <lm> 0.201
## 2 <resample [142 x 24]> <resample [36 x 24]> 2 <lm> 0.146
## 3 <resample [142 x 24]> <resample [36 x 24]> 3 <lm> 0.193
## 4 <resample [143 x 24]> <resample [35 x 24]> 4 <lm> 0.190
## 5 <resample [143 x 24]> <resample [35 x 24]> 5 <lm> 0.152
mean(cv_qda1$rmse)
## [1] 0.1765826
build rmse function to compare rmse of models
do_dataframe_cv1 <- function(formula) {
df04.cv %>%
mutate(fit = map(train, ~lm(formula, data = .)),
rmse = map2_dbl(fit, test, ~rmse(.x,.y))) %>%
summarize(cv_rmse = mean(rmse)) %>%
pull(cv_rmse)
}
RMSE of simple model: lm(log10gdp_percapita ~ Phones (per 1000), data = df03)
do_dataframe_cv1(loggdp_percapita ~ loginf.mortality_per1000births)
## [1] 0.2422357
RMSE of final simple model: lm(log10gdp_percapita ~ Phones (per 1000) + logInfant mortality (per 1000 births) + logAgriculture
do_dataframe_cv1(loggdp_percapita ~ loginf.mortality_per1000births + Phones_per1000 + logAgriculture)
## [1] 0.1957149
RMSE of kitchen sink model: loggdp_percapita ~ net_migration + loginf.mortality_per1000births + Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Birthrate + Deathrate + Service + loginf.mortality_per1000births + logAgriculture + logpop.density_sq.mi
do_dataframe_cv1(loggdp_percapita ~ net_migration + loginf.mortality_per1000births +
Phones_per1000 + Arable_pct + Crops_pct + Other_pct + Birthrate +
Deathrate + Service + loginf.mortality_per1000births +
logAgriculture + logpop.density_sq.mi)
## [1] 0.1765826
greedy selection, start with creating a partion of dataset
df05.cv <- resample_partition(df03.cor,
p=c(train=0.6,
valid=0.20,
test=0.20))
df05.cv
## $train
## <resample [106 x 24]> 1, 3, 5, 6, 7, 11, 14, 15, 16, 17, ...
##
## $valid
## <resample [36 x 24]> 2, 4, 8, 12, 18, 22, 25, 28, 31, 38, ...
##
## $test
## <resample [36 x 24]> 9, 10, 13, 21, 23, 24, 27, 36, 45, 50, ...
In forward selection, we begin with an empty model (no candidate variables), and at each step, we add the variable that improves the model the most.
step1 <- function(response, predictors, candidates, partition) {
rhs <- paste0(paste0(predictors, collapse = "+"), "+", candidates)
formulas <- lapply(paste0(response, "~", rhs), as.formula)
rmses <- sapply(formulas,
function(fm) rmse(lm(fm, data = partition$train),
data = partition$valid))
names(rmses) <- candidates
attr(rmses, "best") <- rmses[which.min(rmses)]
rmses
}
initalize a variable for tracking out model
model <- NULL
Step 1 (no variables): the best one, and the first one to be added as a predictor is ‘log10(infant mortality rate (per 1,000 births))’; RMSE: 0.2654632
preds <- "1"
cands <- c("Population",
"logArea_sq.mi",
"Coastline",
"net_migration","loginf.mortality_per1000births",
"Literacy.pct", "Phones_per1000","Arable_pct",
"Crops_pct", "Other_pct","Climate","Birthrate",
"Deathrate","logAgriculture","Industry","Service",
"logpop.density_sq.mi")
s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))
s1
## Population logArea_sq.mi
## 0.4906678 0.4555911
## Coastline net_migration
## 0.7665041 0.4643247
## loginf.mortality_per1000births Literacy.pct
## 0.2775538 0.3649646
## Phones_per1000 Arable_pct
## 0.2351127 0.4826919
## Crops_pct Other_pct
## 0.4770081 0.4827703
## Climate Birthrate
## 0.4688727 0.2870815
## Deathrate logAgriculture
## 0.4164659 0.2434491
## Industry Service
## 0.4679956 0.4197100
## logpop.density_sq.mi
## 0.4604890
## attr(,"best")
## Phones_per1000
## 0.2351127
step 2 ()
preds <- "Phones_per1000"
cands <- c("Population",
"logArea_sq.mi",
"Coastline",
"net_migration","loginf.mortality_per1000births",
"Literacy.pct","Arable_pct",
"Crops_pct", "Other_pct","Climate","Birthrate",
"Deathrate","logAgriculture","Industry","Service",
"logpop.density_sq.mi")
s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))
s1
## Population logArea_sq.mi
## 0.2392479 0.2437711
## Coastline net_migration
## 0.2485474 0.2356203
## loginf.mortality_per1000births Literacy.pct
## 0.2384132 0.2325620
## Arable_pct Crops_pct
## 0.2392499 0.2335947
## Other_pct Climate
## 0.2361013 0.2356103
## Birthrate Deathrate
## 0.2076721 0.2251351
## logAgriculture Industry
## 0.1850448 0.2268648
## Service logpop.density_sq.mi
## 0.2389994 0.2343216
## attr(,"best")
## logAgriculture
## 0.1850448
preds <- c("logAgriculture","Phones_per1000")
cands <- c("Population",
"logArea_sq.mi",
"Coastline",
"net_migration","loginf.mortality_per1000births",
"Literacy.pct","Arable_pct",
"Crops_pct", "Other_pct","Climate","Birthrate",
"Deathrate","Industry","Service",
"logpop.density_sq.mi")
s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))
s1
## Population logArea_sq.mi
## 0.1870160 0.1905393
## Coastline net_migration
## 0.2901266 0.1823765
## loginf.mortality_per1000births Literacy.pct
## 0.1976153 0.1856909
## Arable_pct Crops_pct
## 0.1851106 0.1891013
## Other_pct Climate
## 0.1846467 0.1785219
## Birthrate Deathrate
## 0.1676147 0.1823447
## Industry Service
## 0.2011685 0.1866038
## logpop.density_sq.mi
## 0.1847536
## attr(,"best")
## Birthrate
## 0.1676147
preds <- c("Birthrate","Phones_per1000","logAgriculture")
cands <- c("Population",
"logArea_sq.mi",
"Coastline",
"net_migration","loginf.mortality_per1000births",
"Literacy.pct","Arable_pct",
"Crops_pct", "Other_pct","Climate",
"Deathrate","Industry","Service",
"logpop.density_sq.mi")
s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))
s1
## Population logArea_sq.mi
## 0.1666394 0.1636918
## Coastline net_migration
## 0.1909273 0.1709369
## loginf.mortality_per1000births Literacy.pct
## 0.1810549 0.1726926
## Arable_pct Crops_pct
## 0.1674778 0.1691904
## Other_pct Climate
## 0.1653895 0.1704070
## Deathrate Industry
## 0.1710802 0.1766612
## Service logpop.density_sq.mi
## 0.1689362 0.1531163
## attr(,"best")
## logpop.density_sq.mi
## 0.1531163
preds <- c("Birthrate","Phones_per1000","logAgriculture","logpop.density_sq.mi")
cands <- c("Population",
"logArea_sq.mi",
"Coastline",
"net_migration","loginf.mortality_per1000births",
"Literacy.pct","Arable_pct",
"Crops_pct", "Other_pct","Climate",
"Deathrate","Industry","Service")
s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))
s1
## Population logArea_sq.mi
## 0.1563595 0.1523853
## Coastline net_migration
## 0.1533584 0.1577634
## loginf.mortality_per1000births Literacy.pct
## 0.1609320 0.1556995
## Arable_pct Crops_pct
## 0.1515691 0.1610162
## Other_pct Climate
## 0.1527507 0.1567245
## Deathrate Industry
## 0.1567965 0.1606570
## Service
## 0.1536528
## attr(,"best")
## Arable_pct
## 0.1515691
preds <- c("Arable_pct","Phones_per1000","logAgriculture","logpop.density_sq.mi","Crops_pct")
cands <- c("Population",
"logArea_sq.mi",
"Coastline",
"net_migration","loginf.mortality_per1000births",
"Literacy.pct",
"Crops_pct", "Other_pct","Climate",
"Deathrate","Industry","Service")
s1 <- step1("loggdp_percapita", preds, cands, df05.cv)
model <- c(model, attr(s1, "best"))
s1
## Population logArea_sq.mi
## 0.1907350 0.1907232
## Coastline net_migration
## 0.1923280 0.1821928
## loginf.mortality_per1000births Literacy.pct
## 0.1769431 0.1798873
## Crops_pct Other_pct
## 0.1858573 0.1851522
## Climate Deathrate
## 0.1811446 0.1734087
## Industry Service
## 0.1924764 0.1857436
## attr(,"best")
## Deathrate
## 0.1734087
cut off model at net migration
step_model <- tibble(index = seq_along(model),
variable = factor(names(model), levels = names(model)),
RMSE=model)
ggplot(step_model, aes(y = RMSE)) +
geom_point(aes(x=variable)) +
geom_line(aes(x=index)) +
labs(title = "Stepwise Model Selection Plot",
x = "Predictor Variables") +
theme_classic() +
theme(axis.text.x = element_text(color = "black",
size = 9, angle=30,
vjust=.8, hjust=0.8))
set.seed(4)
modelstepwise <- lm(loggdp_percapita ~ Phones_per1000 + logAgriculture +
Birthrate + logpop.density_sq.mi + Arable_pct,
data=df05.cv$valid)
rmse(modelstepwise, df05.cv$valid)
## [1] 0.1430894
modelstepwisefinal <- lm(loggdp_percapita ~ Phones_per1000 + logAgriculture +
Birthrate + logpop.density_sq.mi + Arable_pct,
data=df05.cv$test)
rmse(modelstepwise, df05.cv$test)
## [1] 0.215647
summary(modelstepwise)
##
## Call:
## lm(formula = loggdp_percapita ~ Phones_per1000 + logAgriculture +
## Birthrate + logpop.density_sq.mi + Arable_pct, data = df05.cv$valid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26546 -0.06345 -0.00269 0.10029 0.34787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7730597 0.1956711 19.283 < 2e-16 ***
## Phones_per1000 0.0009994 0.0002570 3.888 0.000519 ***
## logAgriculture -0.3634616 0.0786657 -4.620 6.79e-05 ***
## Birthrate -0.0148046 0.0038180 -3.878 0.000534 ***
## logpop.density_sq.mi -0.2071066 0.0727134 -2.848 0.007864 **
## Arable_pct 0.0036985 0.0026212 1.411 0.168533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1567 on 30 degrees of freedom
## Multiple R-squared: 0.9117, Adjusted R-squared: 0.8969
## F-statistic: 61.92 on 5 and 30 DF, p-value: 6.738e-15
summary(modelstepwisefinal)
##
## Call:
## lm(formula = loggdp_percapita ~ Phones_per1000 + logAgriculture +
## Birthrate + logpop.density_sq.mi + Arable_pct, data = df05.cv$test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47776 -0.13738 0.00086 0.14894 0.26388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.892e+00 2.357e-01 16.515 < 2e-16 ***
## Phones_per1000 9.018e-04 2.278e-04 3.958 0.000428 ***
## logAgriculture -1.844e-01 1.056e-01 -1.747 0.090942 .
## Birthrate -1.775e-02 5.412e-03 -3.280 0.002635 **
## logpop.density_sq.mi -9.355e-02 6.693e-02 -1.398 0.172420
## Arable_pct -8.006e-05 3.505e-03 -0.023 0.981925
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2069 on 30 degrees of freedom
## Multiple R-squared: 0.853, Adjusted R-squared: 0.8285
## F-statistic: 34.83 on 5 and 30 DF, p-value: 1.27e-11
df03.cor %>%
add_residuals(modelstepwise, "resid") %>%
ggplot(aes(sample=resid)) +
geom_qq() +
theme_minimal() +
labs(x = "Theoretical Quantiles",
y = "Sample Quantilies",
title = "QQ Plot: Standardized residuals")
df03.cor %>%
add_residuals(modelstepwise, "resid") %>%
ggplot(aes(x=resid)) +
geom_histogram(bins=30) +
labs(x="Residuals",
title = "Distribution of residuals") +
theme_minimal()
best predictive model for gdp per capita
do_dataframe_cv1(loggdp_percapita ~ Phones_per1000 + loginf.mortality_per1000births + logAgriculture)
## [1] 0.1957149
#vs.
do_dataframe_cv1(loggdp_percapita ~ Phones_per1000 + logAgriculture + Birthrate + logpop.density_sq.mi)
## [1] 0.183917
#remaining things to do: add notations to code chunks, do simple visualization for presentation,kint
lm5<- randomForest(loggdp_percapita ~., data = df03.cor, proximity = TRUE)
summary(lm5)
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 178 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 178 -none- numeric
## importance 23 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 31684 -none- numeric
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 178 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
plot(lm5)
print(lm5)
##
## Call:
## randomForest(formula = loggdp_percapita ~ ., data = df03.cor, proximity = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 7
##
## Mean of squared residuals: 0.007785018
## % Var explained: 97.01
rmse(lm5, df05.cv$valid)
## [1] 0.04277197
mae(lm5,df05.cv$valid)
## [1] 0.03305515
varImpPlot(lm5)
importance(lm5)
## IncNodePurity
## Population 0.08024075
## Area_sq.mi 0.06820220
## pop.density_sq.mi 0.07594269
## Coastline 0.06389513
## net_migration 0.74968611
## inf.mortality_per1000births 4.27324369
## GDP_percapita 16.45243249
## Literacy.pct 0.28473874
## Phones_per1000 7.90083221
## Arable_pct 0.10792763
## Crops_pct 0.16305132
## Other_pct 0.07920452
## Climate 0.05784808
## Birthrate 2.57563623
## Deathrate 0.19325727
## Agriculture 2.60165108
## Industry 0.15777569
## Service 0.23263785
## loginf.mortality_per1000births 3.72387853
## logBirthrate 2.46254464
## logAgriculture 3.24172447
## logArea_sq.mi 0.07456106
## logpop.density_sq.mi 0.08767843
library(vip)
vip(modelstepwise, aesthetics = list(fill = "purple")) + theme_classic()
finaldata <- df05.cv$test$data
finaldata$gdp_percapita <- df03.cor$GDP_percapita
pred_test <- data.frame(10^(predict(modelstepwise, newdata = finaldata)), finaldata$GDP_percapita)
colnames(pred_test) <- c('Predicted_Values', 'Actual_Values')
pred_test <- mutate(pred_test, Absolute_Difference = abs(Predicted_Values - Actual_Values))
pred_test
## Predicted_Values Actual_Values Absolute_Difference
## 1 862.5046 700 162.50461
## 2 3127.1417 4500 1372.85833
## 3 5433.5860 6000 566.41402
## 4 12355.3486 8600 3755.34862
## 5 15893.5304 11000 4893.53040
## 6 8372.0661 11200 2827.93392
## 7 4641.6092 3500 1141.60922
## 8 31883.9313 28000 3883.93133
## 9 41018.3656 29000 12018.36558
## 10 23997.5887 30000 6002.41130
## 11 3786.5588 3400 386.55876
## 12 17887.0179 16700 1187.01790
## 13 10272.7891 16900 6627.21087
## 14 1586.1098 1900 313.89016
## 15 11633.9327 15700 4066.06728
## 16 11476.7149 6100 5376.71489
## 17 23467.0068 29100 5632.99318
## 18 3574.3810 4900 1325.61899
## 19 1186.9527 1100 86.95272
## 20 41242.5121 36000 5242.51214
## 21 1463.2861 1300 163.28609
## 22 4416.5479 2400 2016.54787
## 23 10318.5481 9000 1318.54812
## 24 7786.7524 7600 186.75236
## 25 20685.0127 16000 4685.01271
## 26 7624.5249 18600 10975.47512
## 27 12929.3948 7600 5329.39476
## 28 963.3744 1100 136.62557
## 29 1917.3856 1800 117.38565
## 30 780.5352 600 180.53524
## 31 1699.3978 1900 200.60223
## 32 1341.5023 1800 458.49768
## 33 3353.1865 1400 1953.18645
## 34 44115.4844 35000 9115.48439
## 35 1604.8543 1100 504.85433
## 36 1250.1169 1200 50.11688
## 37 8694.4323 9900 1205.56768
## 38 6111.8958 5000 1111.89575
## 39 4521.2273 6300 1778.77272
## 40 1022.6582 700 322.65819
## 41 863.6166 700 163.61661
## 42 2362.1134 700 1662.11344
## 43 7052.9992 9100 2047.00084
## 44 1397.3649 1400 2.63512
## 45 6841.4175 2900 3941.41748
## 46 15736.8864 15700 36.88638
## 47 41793.2699 31100 10693.26988
## 48 1612.9964 1300 312.99642
## 49 5539.7841 5400 139.78409
## 50 3053.1773 6000 2946.82265
## 51 4593.7742 3300 1293.77422
## 52 3044.4681 4000 955.53189
## 53 3049.5680 4800 1750.43202
## 54 3707.1201 2700 1007.12005
## 55 2092.6165 700 1392.61650
## 56 16659.0837 12300 4359.08366
## 57 997.3513 700 297.35134
## 58 4197.1992 5800 1602.80076
## 59 23328.1811 27400 4071.81888
## 60 30496.2278 27600 2896.22778
## 61 12141.1103 8300 3841.11032
## 62 7855.3751 17500 9644.62486
## 63 3641.2615 5500 1858.73847
## 64 1115.6340 1700 584.36599
## 65 5099.9711 2500 2599.97109
## 66 49854.2439 27600 22254.24390
## 67 1400.0569 2200 799.94309
## 68 23046.9463 20000 3046.94626
## 69 6201.5705 5000 1201.57048
## 70 7188.0734 8000 811.92663
## 71 1896.9535 4100 2203.04650
## 72 1168.8516 2100 931.14844
## 73 1029.8544 800 229.85443
## 74 4972.7897 4000 972.78973
## 75 1104.6541 1600 495.34590
## 76 2472.9858 2600 127.01423
## 77 34112.8551 28800 5312.85505
## 78 17794.0989 13900 3894.09887
## 79 32375.0086 30900 1475.00861
## 80 2734.1284 2900 165.87157
## 81 2796.9692 3200 403.03076
## 82 6829.1101 7000 170.88994
## 83 2690.0748 1500 1190.07485
## 84 16731.1954 29600 12868.80463
## 85 12331.5133 19800 7468.48674
## 86 4239.3722 3900 339.37225
## 87 18193.8887 28200 10006.11127
## 88 5459.6771 4300 1159.67709
## 89 10032.8990 6300 3732.89905
## 90 1385.7132 1000 385.71316
## 91 2395.5542 1300 1095.55418
## 92 14275.2187 17800 3524.78125
## 93 12351.2488 19000 6648.75115
## 94 2610.6509 1600 1010.65085
## 95 1270.5097 1700 429.49029
## 96 17989.2017 10200 7789.20170
## 97 2394.5300 3000 605.47005
## 98 743.3385 1000 256.66145
## 99 18266.4173 25000 6733.58267
## 100 17817.7702 19400 1582.22981
## 101 1187.6213 800 387.62133
## 102 945.4627 600 345.46266
## 103 4341.5272 9000 4658.47277
## 104 1031.6614 3900 2868.33861
## 105 954.0438 900 54.04382
## 106 2960.5432 1600 1360.54325
## 107 8094.4480 14400 6305.55195
## 108 1985.8519 1800 185.85193
## 109 7692.8073 11400 3707.19266
## 110 7126.3163 9000 1873.68368
## 111 1934.1250 2000 65.87496
## 112 5103.3776 1800 3303.37762
## 113 1577.4165 1200 377.41655
## 114 5822.2335 7200 1377.76652
## 115 17481.0966 28600 11118.90345
## 116 15621.5322 11400 4221.53222
## 117 6889.5073 15000 8110.49274
## 118 19205.1169 21600 2394.88314
## 119 2850.6382 2300 550.63825
## 120 953.7661 800 153.76613
## 121 1149.7352 900 249.73525
## 122 27964.5069 37800 9835.49309
## 123 4476.9657 13100 8623.03428
## 124 1699.1262 2100 400.87377
## 125 9129.5242 9000 129.52419
## 126 5093.3996 6300 1206.60037
## 127 1948.1865 2200 251.81345
## 128 2548.6121 4700 2151.38786
## 129 4793.6170 5100 306.38303
## 130 2025.1319 4600 2574.86807
## 131 13898.8222 11100 2798.82219
## 132 13514.9431 18000 4485.05694
## 133 12592.7563 16800 4207.24374
## 134 23465.5849 21500 1965.58488
## 135 6398.8272 5800 598.82724
## 136 8236.6159 7000 1236.61595
## 137 893.5355 1300 406.46449
## 138 19736.0692 8800 10936.06917
## 139 5298.3792 5400 101.62082
## 140 4367.5399 2900 1467.53990
## 141 4547.2311 5600 1052.76890
## 142 1110.5380 1200 89.46197
## 143 6134.4160 11800 5665.58403
## 144 1841.5065 1600 241.50647
## 145 7635.5251 7800 164.47494
## 146 691.5215 500 191.52154
## 147 898.8130 500 398.81301
## 148 8216.7457 10700 2483.25430
## 149 19397.1190 22000 2602.88104
## 150 2589.1407 3700 1110.85932
## 151 1588.9470 1900 311.05300
## 152 8412.9916 4000 4412.99164
## 153 2489.8557 4900 2410.14430
## 154 63418.4064 26800 36618.40641
## 155 34921.7298 32700 2221.72981
## 156 2585.4414 3300 714.55858
## 157 20866.7869 23400 2533.21312
## 158 1670.6324 1000 670.63237
## 159 5192.8646 7400 2207.13541
## 160 1383.3201 1500 116.67990
## 161 2300.2709 2200 100.27085
## 162 17484.6571 9500 7984.65708
## 163 4797.4177 6900 2102.58230
## 164 6990.1774 6700 290.17739
## 165 3089.9702 5800 2710.02976
## 166 842.3678 1400 557.63217
## 167 9629.7151 5400 4229.71514
## 168 14712.2799 23200 8487.72012
## 169 38461.6690 27700 10761.66905
## 170 89126.1759 37800 51326.17588
## 171 9853.2618 12800 2946.73816
## 172 1929.0261 1700 229.02611
## 173 2726.7849 2900 173.21507
## 174 7157.0735 4800 2357.07345
## 175 3413.5589 2500 913.55887
## 176 1474.0189 800 674.01887
## 177 1563.4974 800 763.49740
## 178 2386.4932 1900 486.49322
mean(pred_test$Absolute_Difference)
## [1] 3094.235